{
 "metadata": {
  "name": "Text Classification"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pylab inline\n",
      "import numpy as np\n",
      "import pylab as pl"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].\n",
        "For more information, type 'help(pylab)'.\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.datasets import fetch_20newsgroups\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "from sklearn.cluster import MiniBatchKMeans"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the 20 newsgroups training split and keep its class labels.\n",
      "twenty_train = fetch_20newsgroups(subset='train')\n",
      "y_train = twenty_train.target\n",
      "\n",
      "# Fit the TF-IDF vocabulary on the training documents and vectorize them;\n",
      "# max_df=0.5 ignores terms that occur in more than half of the documents.\n",
      "vec = TfidfVectorizer(max_df=0.5)\n",
      "X_train = vec.fit_transform(twenty_train.data)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Learn 1000 centroids over the TF-IDF vectors; they are used below as a\n",
      "# projection basis. random_state pins the initialisation and minibatch\n",
      "# sampling so the clustering (and every score derived from it) is repeatable.\n",
      "# Verbose logging is left off: it prints one line per minibatch iteration.\n",
      "kmeans = MiniBatchKMeans(n_clusters=1000, batch_size=1000, reassignment_ratio=0,\n",
      "                         n_init=1, max_iter=10, compute_labels=False,\n",
      "                         random_state=0)\n",
      "%time kmeans.fit(X_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# X_train is a scipy.sparse matrix, which np.dot cannot multiply (it raised\n",
      "# \"ValueError: setting an array element with a sequence\"). The sparse\n",
      "# matrix's own .dot() performs the product and returns a dense array of\n",
      "# shape (n_documents, n_clusters).\n",
      "X_train_proj = X_train.dot(kmeans.cluster_centers_.T)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# safe_sparse_dot is defined in sklearn.utils.extmath; import it from there\n",
      "# rather than through a module that merely happens to import it itself.\n",
      "from sklearn.utils.extmath import safe_sparse_dot"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train_proj = safe_sparse_dot(X_train, kmeans.cluster_centers_.T)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train_proj.shape"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 21,
       "text": [
        "(11314, 1000)"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.ensemble import ExtraTreesClassifier"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%time\n",
      "# bootstrap=True is needed for the out-of-bag estimate requested with\n",
      "# oob_score=True; random_state makes the fitted forest and its OOB score\n",
      "# repeatable from run to run.\n",
      "clf = ExtraTreesClassifier(\n",
      "    n_estimators=100, max_depth=100, bootstrap=True, oob_score=True,\n",
      "    n_jobs=-1, random_state=0\n",
      ").fit(X_train_proj, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 13.3 s, sys: 2.13 s, total: 15.5 s\n",
        "Wall time: 2min 5s\n"
       ]
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "clf.oob_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "0.67668375464026864"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.linear_model import PassiveAggressiveClassifier\n",
      "\n",
      "# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18 and\n",
      "# sklearn.grid_search was removed in 0.20; fall back only on old installs.\n",
      "try:\n",
      "    from sklearn.model_selection import GridSearchCV\n",
      "except ImportError:\n",
      "    from sklearn.grid_search import GridSearchCV"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Tune the PassiveAggressiveClassifier's C parameter with 3-fold\n",
      "# cross-validation on the projected features, fitting candidates in parallel.\n",
      "param_grid = dict(C=[0.01, 0.1, 1, 10, 100])\n",
      "gs = GridSearchCV(\n",
      "    estimator=PassiveAggressiveClassifier(),\n",
      "    param_grid=param_grid,\n",
      "    cv=3,\n",
      "    n_jobs=-1\n",
      ")\n",
      "%time gs.fit(X_train_proj, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 4.03 s, sys: 2.87 s, total: 6.9 s\n",
        "Wall time: 17.7 s\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 29,
       "text": [
        "GridSearchCV(cv=3,\n",
        "       estimator=PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge', n_iter=5,\n",
        "              n_jobs=1, random_state=None, shuffle=False, verbose=0,\n",
        "              warm_start=False),\n",
        "       fit_params={}, iid=True, loss_func=None, n_jobs=-1,\n",
        "       param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',\n",
        "       refit=True, score_func=None, scoring=None, verbose=0)"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gs.best_params_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 30,
       "text": [
        "{'C': 1}"
       ]
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gs.best_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 31,
       "text": [
        "0.83206646632490722"
       ]
      }
     ],
     "prompt_number": 31
    }
   ],
   "metadata": {}
  }
 ]
}